library(sf)
library(lwgeom)
library(igraph)
library(dplyr)
library(ggplot2)
library(readr)
library(tidyr)

## set this to the appropriate directory
setwd("C:\\Papers\\SF_natural_experiment\\Replication_Files")
options(width=80)
options(tibble.width=80)

## Part 1: match precincts and census tracts ##

## import the precinct geometries
precincts = read_sf("districts87.geojson",
                    stringsAsFactors = F)

## import the Treasure Island shapefile in KML format
ti = read_sf("ti.kml",
             stringsAsFactors = F)

ti = ti %>% 
  mutate(precinct = 2102,
         turnout89 = NA,
         turnout87 = NA,
         Name = NULL, ## assigning NULL drops this column
         Description = NULL, ## and this one
         id = max(precincts$id) + 1)

## rbind() combines rows and now we have all precincts in
## one spatial data frame
precincts = rbind(precincts, ti)

## import the precinct data file
merged_precincts = read_csv("consolidated_precincts.csv")

## split the precinct name and unnest
merged_precincts %>% 
  mutate(p87 = strsplit(Precinct, split="[/ ]+")) %>% 
  unnest(p87) -> unnested_precincts

## do the inner join
precincts %>% inner_join(unnested_precincts %>% 
                           mutate(p87 = as.numeric(p87)),
                         by=c("precinct"="p87")) -> split_precincts

## aggregate the geometries to obtain the merged precinct boundaries.

split_precincts %>% 
  group_by(Precinct) %>% 
  summarise(geometry = st_combine(geometry),
            Turnout89 = first(Turnout89),
            Turnout87 = first(Turnout87),
            Reg87 = first(Reg87),
            Reg89 = first(Reg89)) -> merged_precincts_geometry

## going forward, the merged_precincts_geometry is our main spatial dataframe. The precinct ID is the column Precinct.

## Census blocks obtained from https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.1990.html
## one problem with the census blocks shape file is that it does not have information on the coordinate reference system (CRS). 
## we will assign the same CRS that the precincts have (EPSG 4326)

## import the blocks
blocks = read_sf(dsn = "bg06_d90_shp", ## this is the folder name
                 layer = "bg06_d90", ## you can specify the layer name
                 stringsAsFactors = F)

## assign the same coordinate reference system as for precincts
## EPSG 4326 is the traditional lat/lon system used in other 
## products, so it is a safe choice
st_crs(blocks) = 4326


## ok to subset like this?
blocks <- subset(blocks, CO=="075")

## finding associations between precincts and blocks
## Note: we will get warning messages about st_simplify, but everything is fine

b_overlaps = st_overlaps(x = st_simplify(blocks), 
                     y = st_simplify(merged_precincts_geometry), sparse=F)

b_contains = st_contains(x = st_simplify(blocks), 
                     y = st_simplify(merged_precincts_geometry), sparse=F)


b_within = st_within(x = st_simplify(blocks), 
                     y = st_simplify(merged_precincts_geometry), sparse=F)

b_intersects = st_intersects(x = st_simplify(blocks), 
                             y = st_simplify(merged_precincts_geometry), sparse=F)

## the result is a matrix with 21330 rows (the Census blocks) and 621 columns (the precincts). only 607 blocks in SF, ok to subset?
dim(b_intersects)

blocks$id = 1:nrow(blocks)

rownames(b_overlaps) = blocks$id
rownames(b_contains) = blocks$id
rownames(b_intersects) = blocks$id

merged_precincts_geometry$id = 1:nrow(merged_precincts_geometry)

colnames(b_overlaps) = merged_precincts_geometry$id
colnames(b_contains) = merged_precincts_geometry$id
colnames(b_intersects) = merged_precincts_geometry$id

# now we can proceed to the transformations. 

## graph corresponding to the 'st_overlaps' predicate
g1 = graph_from_incidence_matrix(b_overlaps, directed=T, mode="in")
g1_df = igraph::as_data_frame(g1, what="edges") %>% 
  mutate(type = "overlaps") %>% 
  rename(p_id = from, b_id  = to) %>% 
  mutate(p_id = as.numeric(p_id),
         b_id = as.numeric(b_id))


## graph corresponding to 'st_contains' predicate 
g2 = graph_from_incidence_matrix(b_contains, directed=T, mode="in")
g2_df = igraph::as_data_frame(g2, what="edges") %>% 
  mutate(type = "contains") %>% 
  rename(p_id = from, b_id  = to) %>% 
  mutate(p_id = as.numeric(p_id),
         b_id = as.numeric(b_id))
  

## graph corresponding to `st_intersects' predicate
g3 = graph_from_incidence_matrix(b_intersects, directed=T, mode="in")
g3_df = igraph::as_data_frame(g3, what="edges") %>% 
  mutate(type = "intersects") %>% 
  rename(p_id = from, b_id  = to) %>% 
  mutate(p_id = as.numeric(p_id),
         b_id = as.numeric(b_id))


## we're using only the overlaps and contains matrices, leaving out `intersects` -- change to using all, since 5 blocks fall completely within a precinct
g_df = bind_rows(g1_df, g2_df, g3_df)


# percentage of a block contained within a precinct
# Note: We simplify the geometries because calculation of st_intersection() triggered an error related to self-intersecting geometry
# this is likely due to small errors in digitizing the precinct and block maps 
# Note: we will get warning messages about planar coordinates, but everything is fine

shared_area_pct = numeric(nrow(g_df))

for (j in 1:nrow(g_df)) {
  
  block_index = g_df$b_id[j]
  precinct_index = g_df$p_id[j]
  
  p_area = st_area(merged_precincts_geometry[precinct_index, ])
  b_area = st_area(blocks[block_index, ])
  intersection = st_intersection(
    st_simplify(merged_precincts_geometry[precinct_index, ]),
    st_simplify(blocks[block_index, ]))
  int_area = st_area(intersection)
  
  op_type = g_df$type[j]
  
  result = case_when(
    op_type == "intersects" ~ int_area/b_area,
    op_type == 'contains' ~ p_area/b_area,
    op_type == 'overlaps' ~ int_area/b_area
  )
  
  shared_area_pct[j] = result
  }


g_df %>% mutate(shared_area_pct = shared_area_pct) %>% 
  filter(shared_area_pct > 0) %>% 
  group_by(p_id, b_id) %>% 
  summarise(shared_area_pct = sum(shared_area_pct)) %>% 
  ungroup() -> df

## display for diagnostics

### 1/29/2021 -- looks like we're missing 5 blocks -- 226.3, 17602.3, 17602.2, 17901.4, 106.2
ggplot() + geom_sf(data = merged_precincts_geometry %>% 
                     filter(id %in% unique(df$p_id)),
                   fill="blue", alpha=0.5) +
  geom_sf(data = blocks %>% filter(id %in% unique(df$b_id)),
          fill="orange", alpha=0.5)

# merging with the spatial dataframes of merged precincts and blocks so that we have everything in one place:

df %>% inner_join(as.data.frame(merged_precincts_geometry),
                  by=c("p_id"="id")) %>% 
  inner_join(as.data.frame(blocks),
             by=c("b_id"="id")) -> census.gis

#census.gis <- subset(census.gis, CO=="075")

census.gis$TRACTBNA <- as.numeric(gsub("00", "", census.gis$TRACT))
census.gis$BLCKGR <- census.gis$BG

census.gis$tract_block <- paste(census.gis$TRACT, census.gis$BG, sep=".")
total.pct = tapply(census.gis$shared_area_pct, census.gis$tract_block, sum)
total.pct = as.data.frame(cbind(names(total.pct), total.pct))
colnames(total.pct)[1] = "tract_block"
census.gis = merge(census.gis, total.pct)
census.gis$block.pct = census.gis$shared_area_pct/as.numeric(census.gis$total.pct)

census.gis = subset(census.gis, select= c(TRACTBNA, BLCKGR, Precinct, block.pct))

census.301 = read.csv("census_data/stf301.csv", header= TRUE)
census.301 = subset(census.301, select= c(TRACTBNA, BLCKGR, P0010001, P0070002, P0080001, P0080002, P0080004, P0130027, P0130028, P0130029, P0130030, P0130031))

census.310 = read.csv("census_data/stf310.csv", header= TRUE)
census.310 = subset(census.310, select= c(TRACTBNA, BLCKGR, P0570006, P0570007))

census.314 = read.csv("census_data/stf314.csv", header= TRUE)
census.314 = subset(census.314, select= c(TRACTBNA, BLCKGR, P080A001))

census.data = merge(census.301, census.310)
census.data = merge(census.data, census.314)

all.census.data = merge(census.gis, census.data)

all.census.data$Precinct[all.census.data$TRACTBNA==17902] = 2102  # correction for Treasure Island

# adjust by block pct

all.census.data$P0010001 = all.census.data$P0010001 * all.census.data$block.pct
all.census.data$P0070002 = all.census.data$P0070002 * all.census.data$block.pct
all.census.data$P0080001 = all.census.data$P0080001 * all.census.data$block.pct
all.census.data$P0080002 = all.census.data$P0080002 * all.census.data$block.pct
all.census.data$P0080004 = all.census.data$P0080004 * all.census.data$block.pct
all.census.data$P0130027 = all.census.data$P0130027 * all.census.data$block.pct
all.census.data$P0130028 = all.census.data$P0130028 * all.census.data$block.pct
all.census.data$P0130029 = all.census.data$P0130029 * all.census.data$block.pct
all.census.data$P0130030 = all.census.data$P0130030 * all.census.data$block.pct
all.census.data$P0130031 = all.census.data$P0130031 * all.census.data$block.pct
all.census.data$P0570006 = all.census.data$P0570006 * all.census.data$block.pct
all.census.data$P0570007 = all.census.data$P0570007 * all.census.data$block.pct
all.census.data$P080A001 = all.census.data$P080A001 * all.census.data$P0010001

all.census.data <- subset(all.census.data, select=-c(TRACTBNA, BLCKGR))
precinct.census.data = aggregate(. ~ Precinct, data= all.census.data, FUN= sum)


precinct.census.data$pct_female = precinct.census.data$P0070002/precinct.census.data$P0010001
precinct.census.data$pct_white = precinct.census.data$P0080001/precinct.census.data$P0010001
precinct.census.data$pct_black = precinct.census.data$P0080002/precinct.census.data$P0010001
precinct.census.data$pct_asian = precinct.census.data$P0080004/precinct.census.data$P0010001  # asian or pacific islander
precinct.census.data$pct_over65 = (precinct.census.data$P0130027+precinct.census.data$P0130028+precinct.census.data$P0130029+precinct.census.data$P0130030+precinct.census.data$P0130031)/precinct.census.data$P0010001
precinct.census.data$pct_college = (precinct.census.data$P0570006 + precinct.census.data$P0570007)/precinct.census.data$P0010001
precinct.census.data$med_income = precinct.census.data$P080A001/precinct.census.data$P0010001 # median household income

precinct.census.data$med_income = precinct.census.data$med_income/10000 # median income in 10000s

## Part 3: combining all data ##

precinct.data = read.csv("heat_data.csv", header=TRUE)
precinct.data$turnout.diff = (precinct.data$Turnout89/precinct.data$Reg89) - (precinct.data$Turnout87/precinct.data$Reg87)
polling.place.moved = c("1506/ 1507", "4701", "4716", "4718", "4800", "4806", "4809", "4815/ 4814", "5807", "9401/ 9402", "9419", "9456")
precinct.data$new.pp = as.numeric(precinct.data$Precinct %in% polling.place.moved)
# for alternative placebo test add 4788 = Presidio, 9465 = Lake Merced Park
adjacent.to.moved =  unique(c(
"1505", "1510", "1518", "1521/ 1522",
"4702", "4703", "4700", 
"4717", "4712", "4713", "4715", "4727", "4726", 
"4712", "4711", "4720", "4719", "4717", 
"4801/ 4802", "4803/ 4804", 
"4807/ 4808", "4805", "4803/ 4804", 
"4811", "4789", "4807/ 4808", "4810", 
"4817/ 4816", "4786/ 4787", "4818", "4817/ 4816", 
"5806", "5808", "5900/ 5901", "5903/ 5902", 
"9700", "9701", "9400", 
"9409/ 9410", "9411", "9418", "9429", "9428", "9420", 
"9452", "9455", "9454", "9458/ 9457"
))
precinct.data$placebo = as.numeric(precinct.data$Precinct %in% adjacent.to.moved)

Consolidated = precinct.data$Precinct[grep("/", precinct.data$Precinct)]
precinct.data$consolidated = as.numeric(precinct.data$Precinct %in% Consolidated)

precinct.data = subset(precinct.data, select=c(Precinct, turnout.diff, Reg89, Turnout89, Reg87, Turnout87, new.pp, placebo, consolidated, quake_disruption))

all.data = merge(precinct.data, precinct.census.data)

all.data$turnout.diff = all.data$turnout.diff * 100
all.data$pct_over65 = all.data$pct_over65 * 100
all.data$pct_college = all.data$pct_college * 100
all.data$pct_white = all.data$pct_white * 100
all.data$pct_black = all.data$pct_black * 100
all.data$pct_asian = all.data$pct_asian * 100
all.data$pct_female = all.data$pct_female * 100

all.data$turnout.87 = (all.data$Turnout87/all.data$Reg87) * 100
all.data$turnout.89 = (all.data$Turnout89/all.data$Reg89) * 100

write.csv(all.data, "SF_ne_data.csv", row.names=FALSE)

